#Doing the Recency Frequency and monetory analysis of ecommerce dataset

df=read.csv("d:/Ecommerce Project 2.csv")
head(df)
##   InvoiceNo StockCode                         Description Quantity InvoiceDate
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6   29-Nov-16
## 2    536365     71053                 WHITE METAL LANTERN        6   29-Nov-16
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8   29-Nov-16
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6   29-Nov-16
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6   29-Nov-16
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2   29-Nov-16
##   UnitPrice CustomerID        Country  X
## 1      2.55      17850 United Kingdom NA
## 2      3.39      17850 United Kingdom NA
## 3      2.75      17850 United Kingdom NA
## 4      3.39      17850 United Kingdom NA
## 5      3.39      17850 United Kingdom NA
## 6      7.65      17850 United Kingdom NA
#displaying the stucture , the datatypes of ecom data frame
str(df)
## 'data.frame':    541909 obs. of  9 variables:
##  $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr  "29-Nov-16" "29-Nov-16" "29-Nov-16" "29-Nov-16" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ X          : logi  NA NA NA NA NA NA ...

#need to extract numbers from InvoiceNo,Stockcode,CustomerID #eliminating characters from invoice_no

inv=gsub('[A-Z]*','',df$InvoiceNo)
inv=as.numeric(inv)
df$InvoiceNo=inv

#eliminating characters from CustomerID

cid=gsub("[^[:alnum:]]", "",df$CustomerID)
cid=as.numeric(cid)
df$CustomerID=cid
#reviewing the structure of the data frame again
str(df)
## 'data.frame':    541909 obs. of  9 variables:
##  $ InvoiceNo  : num  536365 536365 536365 536365 536365 ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr  "29-Nov-16" "29-Nov-16" "29-Nov-16" "29-Nov-16" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num  17850 17850 17850 17850 17850 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ X          : logi  NA NA NA NA NA NA ...
#Analysing the missing values

df_missing=sapply(df,function(x)(sum(is.na(x))))
df_missing[df_missing>0]
## CustomerID          X 
##     135080     541909
# dropping the column x as it has no values
df_missing
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0           0           0           0           0 
##  CustomerID     Country           X 
##      135080           0      541909
df=df[,-9]


#removing the rows with null customer id
df=na.omit(df)
df_missing=sapply(df,function(x)(sum(is.na(x))))
df_missing[df_missing>0]
## named integer(0)
dim(df)
## [1] 406829      8

#creating a column total_price as quantity*unit_price

df$tot_price=df$Quantity*df$UnitPrice

str(df)
## 'data.frame':    406829 obs. of  9 variables:
##  $ InvoiceNo  : num  536365 536365 536365 536365 536365 ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr  "29-Nov-16" "29-Nov-16" "29-Nov-16" "29-Nov-16" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num  17850 17850 17850 17850 17850 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ tot_price  : num  15.3 20.3 22 20.3 20.3 ...
##  - attr(*, "na.action")= 'omit' Named int [1:135080] 623 1444 1445 1446 1447 1448 1449 1450 1451 1452 ...
##   ..- attr(*, "names")= chr [1:135080] "623" "1444" "1445" "1446" ...
View(df)
library(plyr)
## Warning: package 'plyr' was built under R version 4.0.5
df_country=count(df$Country)


df_country=df_country[order(df_country$freq),]
head(df_country)
##                 x freq
## 29   Saudi Arabia   10
## 3         Bahrain   17
## 9  Czech Republic   30
## 5          Brazil   32
## 22      Lithuania   35
## 21        Lebanon   45
#converting country to factor variables

df$Country=as.factor(df$Country)

#creating columns for calculating recency freqiuency and monetory analysis

#grouping the  data by customer id ,how much the customer has spend


library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df_price=df%>%
  group_by(CustomerID)%>%
  summarise("tot"=sum(tot_price))
head(df_price)
## # A tibble: 6 x 2
##   CustomerID   tot
##        <dbl> <dbl>
## 1      12346    0 
## 2      12347 4310 
## 3      12348 1797.
## 4      12349 1758.
## 5      12350  334.
## 6      12352 1545.
nrow(df_price)
## [1] 4372

#grouping the data by customer id ,based on how many times they have transacted

library(dplyr)
# df_inv_cid=df %>%
# group_by(CustomerID)%>%
# count(InvoiceNo)



df_txn=df %>%
  group_by(CustomerID) %>%
  summarise(count=n())
head(df_txn)
## # A tibble: 6 x 2
##   CustomerID count
##        <dbl> <int>
## 1      12346     2
## 2      12347   182
## 3      12348    31
## 4      12349    73
## 5      12350    17
## 6      12352    95
nrow(df_txn)
## [1] 4372

#separating date into day month and year

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x dplyr::rename()    masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
df2=separate(df,"InvoiceDate",into=c("Day","Month","Year"),sep="-")
head(df2)
##   InvoiceNo StockCode                         Description Quantity Day Month
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6  29   Nov
## 2    536365     71053                 WHITE METAL LANTERN        6  29   Nov
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8  29   Nov
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6  29   Nov
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6  29   Nov
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2  29   Nov
##   Year UnitPrice CustomerID        Country tot_price
## 1   16      2.55      17850 United Kingdom     15.30
## 2   16      3.39      17850 United Kingdom     20.34
## 3   16      2.75      17850 United Kingdom     22.00
## 4   16      3.39      17850 United Kingdom     20.34
## 5   16      3.39      17850 United Kingdom     20.34
## 6   16      7.65      17850 United Kingdom     15.30
unique(df2$Month)
##  [1] "Nov" "Dec" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct"

#the total transactions by month

library(dplyr)
install.packages("dplyr")
## Warning: package 'dplyr' is in use and will not be installed
df2%>%
  group_by(Month)%>%
  summarise(sum(tot_price))
## # A tibble: 12 x 2
##    Month `sum(tot_price)`
##    <chr>            <dbl>
##  1 Apr            409698.
##  2 Aug            643654.
##  3 Dec            717385.
##  4 Feb            434829.
##  5 Jan            517833.
##  6 Jul            602283.
##  7 Jun            576932.
##  8 Mar            562237.
##  9 May            684053.
## 10 Nov           1245328.
## 11 Oct           1029836.
## 12 Sep            875996.
df2%>%
  group_by(Year)%>%
  summarise(sum(tot_price))
## # A tibble: 2 x 2
##   Year  `sum(tot_price)`
##   <chr>            <dbl>
## 1 16             554604.
## 2 17            7745462.

#calculating recency of each customer

head(df)
##   InvoiceNo StockCode                         Description Quantity InvoiceDate
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6   29-Nov-16
## 2    536365     71053                 WHITE METAL LANTERN        6   29-Nov-16
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8   29-Nov-16
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6   29-Nov-16
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6   29-Nov-16
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2   29-Nov-16
##   UnitPrice CustomerID        Country tot_price
## 1      2.55      17850 United Kingdom     15.30
## 2      3.39      17850 United Kingdom     20.34
## 3      2.75      17850 United Kingdom     22.00
## 4      3.39      17850 United Kingdom     20.34
## 5      3.39      17850 United Kingdom     20.34
## 6      7.65      17850 United Kingdom     15.30
df$InvoiceDate=as.Date(df$InvoiceDate,format="%d-%b-%y")
max(df$InvoiceDate)
## [1] "2017-12-07"
nrow(df)
## [1] 406829
library(dplyr)
df_date=df%>%
  group_by(CustomerID)%>%
  summarise(max(InvoiceDate))

nrow(df_date)
## [1] 4372
df_date$diff_in_days = difftime( max(df$InvoiceDate),df_date$`max(InvoiceDate)`, units = "days")

df_date
## # A tibble: 4,372 x 3
##    CustomerID `max(InvoiceDate)` diff_in_days
##         <dbl> <date>             <drtn>      
##  1      12346 2017-01-16         325 days    
##  2      12347 2017-12-05           2 days    
##  3      12348 2017-09-23          75 days    
##  4      12349 2017-11-19          18 days    
##  5      12350 2017-01-31         310 days    
##  6      12352 2017-11-01          36 days    
##  7      12353 2017-05-17         204 days    
##  8      12354 2017-04-19         232 days    
##  9      12355 2017-05-07         214 days    
## 10      12356 2017-11-15          22 days    
## # ... with 4,362 more rows

#combining recency, frequency and monetory parameters in a single data frame

rfm_matrix=data.frame(cbind("custid"=df_price$CustomerID,"recency"=df_date$diff_in_days,"frequency"=df_txn$count,"monetory"=df_price$tot))
nrow(rfm_matrix)
## [1] 4372
head(rfm_matrix)
##   custid recency frequency monetory
## 1  12346     325         2     0.00
## 2  12347       2       182  4310.00
## 3  12348      75        31  1797.24
## 4  12349      18        73  1757.55
## 5  12350     310        17   334.40
## 6  12352      36        95  1545.41
#install.packages("OneR")
library("OneR")
## Warning: package 'OneR' was built under R version 4.0.5
head(rfm_matrix$monetory)
## [1]    0.00 4310.00 1797.24 1757.55  334.40 1545.41
min(rfm_matrix$monetory)
## [1] -4287.63
max(rfm_matrix$monetory)
## [1] 279489
#creating bins for each variables recency, frequency and monetory

#creating bins for monetory 

summary(rfm_matrix$monetory)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  -4287.6    293.4    648.1   1898.5   1611.7 279489.0
b <- c(-Inf,293.4,648.1,1898.5,1611.7,Inf)
names <- c("1", "2", "3","4","5")
rfm_matrix<-rfm_matrix%>%mutate(mon_bins = cut(monetory, breaks = b,labels=names))
head(rfm_matrix)
##   custid recency frequency monetory mon_bins
## 1  12346     325         2     0.00        1
## 2  12347       2       182  4310.00        5
## 3  12348      75        31  1797.24        4
## 4  12349      18        73  1757.55        4
## 5  12350     310        17   334.40        2
## 6  12352      36        95  1545.41        3
#creating bins for recency

summary(rfm_matrix$recency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   16.00   50.00   91.58  143.00  373.00
b <- c(-Inf,16,50,91.58,143,Inf)
names <- c("5", "4", "3","2","1")
rfm_matrix<-rfm_matrix%>%mutate(rec_bins = cut(recency, breaks = b,labels=names))
head(rfm_matrix)
##   custid recency frequency monetory mon_bins rec_bins
## 1  12346     325         2     0.00        1        1
## 2  12347       2       182  4310.00        5        5
## 3  12348      75        31  1797.24        4        3
## 4  12349      18        73  1757.55        4        4
## 5  12350     310        17   334.40        2        1
## 6  12352      36        95  1545.41        3        4
#creating bins for frequency

summary(rfm_matrix$frequency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   17.00   42.00   93.05  102.00 7983.00
b <- c(-Inf,17,42,93.5,102,Inf)
names <- c("1", "2", "3","4","5")
rfm_matrix<-rfm_matrix%>%mutate(freq_bins = cut(frequency, breaks = b,labels=names))
head(rfm_matrix)
##   custid recency frequency monetory mon_bins rec_bins freq_bins
## 1  12346     325         2     0.00        1        1         1
## 2  12347       2       182  4310.00        5        5         5
## 3  12348      75        31  1797.24        4        3         2
## 4  12349      18        73  1757.55        4        4         3
## 5  12350     310        17   334.40        2        1         1
## 6  12352      36        95  1545.41        3        4         4
rfm_bin=rfm_matrix[,c(1,5,7,6)]

head(rfm_bin)
##   custid mon_bins freq_bins rec_bins
## 1  12346        1         1        1
## 2  12347        5         5        5
## 3  12348        4         2        3
## 4  12349        4         3        4
## 5  12350        2         1        1
## 6  12352        3         4        4
str(rfm_bin)
## 'data.frame':    4372 obs. of  4 variables:
##  $ custid   : num  12346 12347 12348 12349 12350 ...
##  $ mon_bins : Factor w/ 5 levels "1","2","3","4",..: 1 5 4 4 2 3 1 3 2 5 ...
##  $ freq_bins: Factor w/ 5 levels "1","2","3","4",..: 1 5 2 3 1 4 1 3 1 3 ...
##  $ rec_bins : Factor w/ 5 levels "5","4","3","2",..: 5 1 3 2 5 2 5 5 5 2 ...
rfm_bin$mon_bins=as.numeric(rfm_bin$mon_bins)
rfm_bin$rec_bins=as.numeric(rfm_bin$rec_bins)
rfm_bin$freq_bins=as.numeric(rfm_bin$freq_bins)
head(rfm_bin)
##   custid mon_bins freq_bins rec_bins
## 1  12346        1         1        5
## 2  12347        5         5        1
## 3  12348        4         2        3
## 4  12349        4         3        2
## 5  12350        2         1        5
## 6  12352        3         4        2
head(rfm_bin)
##   custid mon_bins freq_bins rec_bins
## 1  12346        1         1        5
## 2  12347        5         5        1
## 3  12348        4         2        3
## 4  12349        4         3        2
## 5  12350        2         1        5
## 6  12352        3         4        2
str(rfm_bin)
## 'data.frame':    4372 obs. of  4 variables:
##  $ custid   : num  12346 12347 12348 12349 12350 ...
##  $ mon_bins : num  1 5 4 4 2 3 1 3 2 5 ...
##  $ freq_bins: num  1 5 2 3 1 4 1 3 1 3 ...
##  $ rec_bins : num  5 1 3 2 5 2 5 5 5 2 ...
head(rfm_bin)
##   custid mon_bins freq_bins rec_bins
## 1  12346        1         1        5
## 2  12347        5         5        1
## 3  12348        4         2        3
## 4  12349        4         3        2
## 5  12350        2         1        5
## 6  12352        3         4        2
str(rfm_bin)
## 'data.frame':    4372 obs. of  4 variables:
##  $ custid   : num  12346 12347 12348 12349 12350 ...
##  $ mon_bins : num  1 5 4 4 2 3 1 3 2 5 ...
##  $ freq_bins: num  1 5 2 3 1 4 1 3 1 3 ...
##  $ rec_bins : num  5 1 3 2 5 2 5 5 5 2 ...
rfm_bin$tot_score=(rfm_bin$rec_bins+rfm_bin$freq_bins+rfm_bin$mon_bins)/3
nrow(rfm_bin)
## [1] 4372
nrow(df)
## [1] 406829

#clustering based on RFM analysis

#Creating an elbow plot

n=1:15
wss=function(k){
  kmod=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],k)
  kmod$tot.withinss
  
}


library(tidyverse)
wss_values=map_dbl(n,wss)
wss_values
##  [1] 28750.382 13685.063 11932.020  7141.264  6547.659  5536.572  4900.850
##  [8]  4491.803  4358.161  3472.163  3376.190  3342.792  2739.297  3221.899
## [15]  2949.219
plot(n, wss_values,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     col="red",
     ylab="Total within-clusters sum of squares")

#making clusters by k=3

k_mod3=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],centers=3)



df_mod3=cbind(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],"cluster"=k_mod3$cluster)
df_mod3$cluster=as.factor(df_mod3$cluster)


head(df_mod3)
##   rec_bins freq_bins mon_bins cluster
## 1        5         1        1       3
## 2        1         5        5       2
## 3        3         2        4       1
## 4        2         3        4       2
## 5        5         1        2       3
## 6        2         4        3       2

Accuracy of the cluster with number of clusters 3 =0.6963555

#making clusters by k=4

head(rfm_bin)
##   custid mon_bins freq_bins rec_bins tot_score
## 1  12346        1         1        5  2.333333
## 2  12347        5         5        1  3.666667
## 3  12348        4         2        3  3.000000
## 4  12349        4         3        2  3.000000
## 5  12350        2         1        5  2.666667
## 6  12352        3         4        2  3.000000
k_mod4=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],centers=4)

df_mod4=cbind(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],"cluster"=k_mod4$cluster)
df_mod4$cluster=as.factor(df_mod4$cluster)

head(df_mod4,20)
##    rec_bins freq_bins mon_bins cluster
## 1         5         1        1       3
## 2         1         5        5       2
## 3         3         2        4       4
## 4         2         3        4       4
## 5         5         1        2       3
## 6         2         4        3       2
## 7         5         1        1       3
## 8         5         3        3       3
## 9         5         1        2       3
## 10        2         3        5       4
## 11        2         5        5       2
## 12        1         2        3       1
## 13        1         5        5       2
## 14        3         5        5       2
## 15        5         1        1       3
## 16        1         5        5       2
## 17        4         2        2       3
## 18        1         3        3       1
## 19        5         2        2       3
## 20        1         1        1       1

Accuracy of the cluster with number of clusters 4 =0.7284862

#making clusters by k=6

set.seed(100)

k_mod6=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],centers=6)

df_mod6=cbind(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],"cluster"=k_mod6$cluster)
df_mod6$cluster=as.factor(df_mod6$cluster)



head(df_mod6)
##   rec_bins freq_bins mon_bins cluster
## 1        5         1        1       1
## 2        1         5        5       6
## 3        3         2        4       3
## 4        2         3        4       3
## 5        5         1        2       1
## 6        2         4        3       2
head(rfm_bin)
##   custid mon_bins freq_bins rec_bins tot_score
## 1  12346        1         1        5  2.333333
## 2  12347        5         5        1  3.666667
## 3  12348        4         2        3  3.000000
## 4  12349        4         3        2  3.000000
## 5  12350        2         1        5  2.666667
## 6  12352        3         4        2  3.000000

Accuracy of the cluster with number of clusters 6=0.8156666

#scaling

summary(df_mod6)
##     rec_bins      freq_bins       mon_bins     cluster 
##  Min.   :1.00   Min.   :1.00   Min.   :1.000   1:1054  
##  1st Qu.:1.00   1st Qu.:1.00   1st Qu.:1.750   2: 690  
##  Median :2.00   Median :2.00   Median :2.500   3: 212  
##  Mean   :2.82   Mean   :2.76   Mean   :2.711   4: 574  
##  3rd Qu.:4.00   3rd Qu.:4.00   3rd Qu.:3.250   5: 831  
##  Max.   :5.00   Max.   :5.00   Max.   :5.000   6:1011
df_mod6_scale=scale(df_mod6[,c("rec_bins","freq_bins","mon_bins")],center=TRUE,scale=TRUE)

summary(df_mod6_scale)
##     rec_bins         freq_bins          mon_bins      
##  Min.   :-1.1989   Min.   :-1.1800   Min.   :-1.1951  
##  1st Qu.:-1.1989   1st Qu.:-1.1800   1st Qu.:-0.6712  
##  Median :-0.5400   Median :-0.5097   Median :-0.1472  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7778   3rd Qu.: 0.8310   3rd Qu.: 0.3768  
##  Max.   : 1.4367   Max.   : 1.5014   Max.   : 1.5994
kmod_61=kmeans(df_mod6_scale,centers=6)

Accuracy of the cluster with number of clusters 6 after normalisation=0.8232957

Using the factoextra R package

The function fviz_cluster() [factoextra package] can be used to easily visualize k-means clusters. It takes k-means results and the original data as arguments.

#In the resulting plot, observations are represented by points, # using principal components if the number of variables is greater than 2. #It’s also possible to draw concentration ellipse around each cluster.

#install.packages("ggpubr")
#install.packages("factoextra")
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.0.5
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

#Compute PCA and extract individual coordinates

Dimension reduction using PCA

res.pca <- prcomp(df_mod6[,c("rec_bins","freq_bins","mon_bins")],  scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(df_mod6$cluster)
# Data inspection


k_mod6=kmeans(rfm_bin[,c("mon_bins","rec_bins","freq_bins")],centers=6)
df_mod6=cbind(rfm_bin[,c("mon_bins","rec_bins","freq_bins")],"cluster"=k_mod6$cluster)
df_mod6$cluster=as.factor(df_mod6$cluster)
head(df_mod6)
##   mon_bins rec_bins freq_bins cluster
## 1        1        5         1       5
## 2        5        1         5       4
## 3        4        3         2       6
## 4        4        2         3       6
## 5        2        5         1       5
## 6        3        2         4       3
res.pca <- prcomp(df_mod6[,c("mon_bins","rec_bins","freq_bins")],  scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(df_mod6$cluster)

Data inspection

eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1        2.1             71.5                        71.5
## Dim.2        0.6             20.5                        92.0
## Dim.3        0.2              8.0                       100.0
ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2", 
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
  size = 1.5,  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +  stat_mean(aes(color = cluster), size = 4)

# Percentage of variance explained by dimensions

eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1        2.1             71.5                        71.5
## Dim.2        0.6             20.5                        92.0
## Dim.3        0.2              8.0                       100.0

Visualize k-means clusters

Color individuals according to the cluster groups

Add concentration ellipses

Add cluster centroid using the stat_mean() [ggpubr] R function

ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2", 
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
   size = 1.5,  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)

library(ggplot2)
head(k_mod6)
## $cluster
##    [1] 5 4 6 6 5 3 5 1 5 6 4 3 4 4 5 4 1 3 1 2 4 6 3 5 3 2 1 4 3 4 6 1 2 5 4 2 3
##   [38] 3 3 4 4 6 1 5 5 2 1 4 6 4 4 1 3 3 1 4 4 1 2 3 1 4 1 3 1 3 4 4 2 4 4 4 3 6
##   [75] 5 4 4 5 2 4 2 3 1 2 4 5 4 2 1 2 6 4 6 3 1 3 3 3 3 1 4 4 4 4 3 4 4 3 2 4 4
##  [112] 6 4 3 5 4 2 2 1 3 2 2 6 4 4 5 2 5 5 1 3 5 1 2 2 1 5 3 4 4 2 4 1 2 4 4 3 2
##  [149] 4 6 2 2 1 3 4 2 4 4 3 2 3 3 5 5 3 5 2 4 3 6 2 1 4 5 4 1 5 2 4 4 2 3 5 5 3
##  [186] 3 4 2 1 2 1 4 4 1 2 2 2 2 1 1 1 1 4 2 4 4 3 6 1 2 2 2 5 3 3 4 3 2 4 1 4 3
##  [223] 2 2 4 3 4 5 1 1 1 4 4 3 3 2 4 4 4 3 1 3 5 3 6 1 6 3 4 1 1 2 5 1 1 2 4 3 3
##  [260] 2 2 4 6 5 5 2 4 1 4 5 2 6 3 4 3 3 4 4 4 4 4 5 4 3 5 3 1 3 2 3 4 3 2 2 6 4
##  [297] 1 4 4 1 4 3 4 1 3 1 2 2 4 4 3 3 3 1 4 4 3 1 4 5 1 5 1 5 5 3 2 1 4 4 4 4 5
##  [334] 6 4 1 1 2 4 6 3 6 1 1 4 1 1 3 3 3 4 3 4 3 3 2 2 2 1 5 1 1 3 2 5 6 1 2 1 1
##  [371] 2 5 1 1 5 1 3 5 3 6 2 3 2 3 5 6 5 2 1 1 4 5 4 4 4 4 3 4 3 1 3 3 1 1 1 4 5
##  [408] 4 1 2 5 3 1 4 1 5 2 1 5 5 6 4 3 2 5 2 3 2 3 3 5 3 5 2 2 5 4 5 3 6 2 4 6 6
##  [445] 6 1 4 5 3 2 4 1 2 3 3 6 5 2 4 3 4 3 4 2 6 4 1 5 2 1 1 4 4 6 3 3 2 4 5 4 2
##  [482] 4 4 2 3 5 3 4 5 1 5 1 2 1 2 3 2 1 6 2 2 2 3 2 3 1 1 4 5 5 4 1 2 5 5 4 4 4
##  [519] 4 3 2 4 4 3 3 6 3 2 3 3 5 2 3 1 3 3 5 1 1 2 4 4 2 4 1 3 2 5 5 2 5 3 5 3 2
##  [556] 2 4 5 5 1 4 4 2 5 4 6 2 5 4 4 2 2 4 6 2 4 4 5 5 4 2 3 5 3 5 3 4 4 4 1 3 3
##  [593] 5 1 1 4 4 2 5 3 5 5 4 1 4 4 3 4 1 4 2 5 2 2 2 4 6 1 2 5 3 6 1 4 2 5 1 3 1
##  [630] 3 4 2 2 3 3 4 3 2 4 2 4 4 5 3 5 2 2 1 2 5 4 4 4 3 3 2 4 4 1 3 4 3 5 5 5 3
##  [667] 1 5 4 6 5 5 2 5 4 4 3 1 3 3 1 1 5 1 5 2 4 2 1 1 1 2 2 3 2 2 1 5 4 6 4 4 4
##  [704] 4 5 3 2 5 2 2 3 2 1 3 2 5 4 1 2 1 5 5 6 2 5 1 1 5 1 6 5 3 4 3 2 3 3 6 4 2
##  [741] 4 4 2 2 3 6 2 4 1 3 1 2 4 3 1 1 4 1 1 4 1 2 2 2 3 2 2 3 1 3 1 1 2 4 3 2 4
##  [778] 2 1 5 1 2 4 3 1 3 4 5 4 1 1 3 2 5 4 1 3 3 1 1 2 2 2 4 3 5 1 3 2 4 2 5 4 4
##  [815] 1 4 2 2 3 3 3 2 4 2 4 1 2 3 4 2 6 4 5 5 3 6 4 2 3 5 3 1 3 4 5 4 2 1 1 1 3
##  [852] 5 4 5 2 5 1 3 2 4 5 3 1 5 1 6 2 1 3 3 1 2 2 2 4 1 4 5 3 4 5 5 2 3 3 4 3 2
##  [889] 4 4 2 4 3 2 4 2 1 2 1 4 4 3 2 2 3 1 3 5 4 4 2 3 2 5 6 3 1 2 4 1 3 5 4 1 2
##  [926] 3 4 6 2 5 1 5 2 4 2 6 3 2 4 2 4 2 4 5 3 2 2 2 3 4 3 4 1 3 5 5 3 5 4 6 6 6
##  [963] 4 4 3 3 3 2 3 1 2 4 5 6 1 4 2 3 4 1 1 2 4 1 3 5 2 1 4 2 2 5 3 1 5 4 3 5 5
## [1000] 6 5 2 2 3 5 4 6 2 5 4 5 4 3 1 1 1 4 2 5 5 1 5 4 5 3 1 2 6 5 4 3 2 3 1 1 2
## [1037] 4 4 1 2 1 3 4 3 1 5 5 1 6 5 1 1 5 4 4 4 3 2 3 5 6 4 4 4 4 4 2 4 5 3 5 4 6
## [1074] 5 2 3 2 2 4 1 1 2 6 5 4 2 2 5 6 5 3 5 2 3 3 4 2 2 1 1 3 3 6 5 3 2 5 6 2 1
## [1111] 3 5 4 5 4 2 2 3 1 3 3 6 3 4 5 2 5 3 4 1 2 2 3 6 4 4 4 1 1 4 2 2 3 4 4 3 4
## [1148] 3 1 2 1 1 5 4 2 1 2 4 2 5 2 1 4 2 4 4 2 2 3 2 5 2 4 3 2 2 2 1 2 3 5 1 1 1
## [1185] 5 2 3 3 6 2 2 5 4 3 1 4 6 2 1 4 5 3 2 2 1 5 4 4 2 5 2 4 5 2 3 2 4 4 2 4 2
## [1222] 4 3 5 3 2 4 4 3 4 5 6 1 4 6 1 1 5 1 2 3 4 3 2 1 1 2 5 2 2 4 4 4 3 3 1 5 3
## [1259] 2 4 5 4 1 5 3 1 4 3 4 2 5 2 1 4 4 5 4 4 3 5 3 5 4 2 3 1 2 1 5 2 1 5 4 4 2
## [1296] 4 2 5 4 3 4 2 4 5 6 3 5 3 4 2 2 4 2 4 3 5 3 5 5 4 2 4 2 1 3 3 1 4 1 4 3 2
## [1333] 3 3 1 5 6 4 2 5 5 3 3 1 5 4 3 2 4 1 5 6 6 5 3 1 2 2 4 1 3 4 4 4 2 5 6 4 4
## [1370] 3 4 1 5 3 5 3 2 2 1 5 6 4 4 2 5 3 4 6 4 2 2 1 4 1 1 1 6 4 3 1 4 3 4 6 2 5
## [1407] 5 4 3 5 5 4 1 1 5 5 4 6 1 4 4 5 3 4 1 1 1 1 5 2 2 3 4 5 4 2 6 4 2 5 3 4 4
## [1444] 4 3 2 3 4 4 2 5 5 4 6 4 4 5 3 5 4 2 1 4 1 5 5 4 1 3 2 4 3 6 1 5 5 4 2 1 2
## [1481] 1 1 2 5 5 1 5 2 5 3 2 3 4 2 6 4 5 5 5 1 5 3 2 3 3 2 5 4 4 3 4 5 4 3 4 1 4
## [1518] 1 4 5 6 4 6 3 1 3 5 3 4 2 4 4 2 3 4 5 2 4 2 4 1 4 2 5 5 5 1 6 3 3 3 4 3 2
## [1555] 4 1 4 2 5 4 5 1 5 4 4 5 1 6 3 3 4 2 2 3 5 2 1 2 1 4 5 2 2 4 3 1 5 4 2 2 1
## [1592] 1 4 4 2 2 4 4 4 4 4 3 2 3 5 1 3 4 2 2 2 2 5 4 4 4 2 3 4 1 6 4 4 5 2 2 2 3
## [1629] 4 5 4 4 4 4 1 1 3 2 1 4 4 2 2 6 3 4 4 1 3 2 5 4 1 5 4 2 2 2 4 1 2 5 4 5 4
## [1666] 3 3 3 4 3 2 1 2 1 4 4 6 2 5 2 1 1 4 1 4 2 1 4 5 3 4 3 5 1 2 2 4 4 2 2 1 3
## [1703] 1 4 5 4 3 2 4 6 1 4 2 4 2 2 4 3 4 2 4 1 1 1 6 3 6 5 4 2 5 4 5 4 5 3 6 1 5
## [1740] 4 5 4 5 4 4 4 2 4 5 3 4 3 4 3 4 2 3 3 4 3 5 4 3 5 1 4 3 4 4 4 2 3 6 1 3 3
## [1777] 1 1 3 2 2 2 4 3 2 1 4 5 2 1 2 2 4 2 4 1 5 3 4 2 1 1 2 5 2 2 3 2 3 2 2 3 5
## [1814] 4 2 2 4 2 1 2 3 2 4 4 5 3 5 1 3 2 2 5 1 3 4 1 3 2 6 2 4 5 4 4 4 5 4 5 2 4
## [1851] 2 4 1 3 3 3 2 2 3 2 2 4 1 4 3 5 3 1 3 4 3 2 2 3 1 1 6 1 5 5 1 3 3 4 1 3 4
## [1888] 3 2 4 3 1 4 2 2 4 3 3 2 3 1 1 3 3 5 2 4 3 5 6 1 4 6 4 2 2 3 2 2 4 2 3 5 1
## [1925] 3 2 4 5 4 5 1 2 2 2 3 4 5 1 2 2 3 4 5 2 1 2 1 2 2 1 3 4 1 4 3 5 4 2 3 2 5
## [1962] 2 2 1 4 4 4 4 3 3 3 3 2 4 1 4 4 2 3 4 4 1 5 2 4 5 4 1 3 5 6 4 5 3 3 1 4 4
## [1999] 3 4 3 5 4 3 6 3 1 5 1 3 1 3 1 4 1 2 5 5 3 4 1 2 1 4 1 2 2 1 2 5 2 3 1 4 3
## [2036] 5 1 1 3 3 4 4 2 3 3 5 5 1 2 6 4 2 4 2 4 6 1 3 1 3 5 3 5 6 2 5 4 3 1 1 2 1
## [2073] 4 4 2 2 3 4 2 4 5 2 4 1 5 2 1 4 5 2 5 4 5 3 1 3 2 2 6 4 1 2 4 6 3 5 2 3 4
## [2110] 5 3 2 3 1 3 5 1 4 3 2 4 5 4 5 5 1 5 1 5 3 4 1 1 5 4 5 3 3 3 6 4 2 2 6 1 5
## [2147] 4 4 3 3 1 3 5 5 1 1 5 5 5 2 1 5 6 2 6 4 3 3 2 2 2 5 5 4 2 2 6 3 4 4 5 4 2
## [2184] 4 6 2 4 1 3 3 5 1 4 3 5 1 2 5 2 5 2 4 3 1 1 2 3 2 1 1 1 2 3 6 1 2 2 2 5 5
## [2221] 2 1 5 4 1 2 4 4 2 5 5 1 4 3 4 2 5 4 6 4 5 4 4 6 5 4 1 2 1 5 5 3 2 6 5 5 5
## [2258] 3 3 1 1 5 3 4 5 4 3 5 3 1 4 2 2 3 5 2 5 4 4 2 3 2 3 3 2 4 1 5 3 5 4 4 5 5
## [2295] 1 3 2 3 4 5 1 1 2 1 5 2 1 4 3 3 2 2 3 1 2 2 3 1 4 1 4 6 5 3 4 3 4 4 2 4 3
## [2332] 4 1 2 4 4 1 1 5 4 5 4 4 2 5 4 2 4 3 2 2 4 2 4 5 4 4 4 4 3 1 5 2 2 5 5 4 2
## [2369] 4 4 3 2 5 3 5 4 2 2 3 5 4 2 2 3 1 4 3 4 3 1 3 5 5 3 4 3 4 1 2 1 5 2 2 4 1
## [2406] 2 2 3 4 3 2 2 3 6 5 3 5 2 3 2 4 3 2 4 3 4 1 1 1 3 4 3 4 2 3 2 5 1 4 4 1 1
## [2443] 3 1 1 1 2 2 1 3 1 3 2 1 2 3 5 5 2 4 2 5 3 5 6 4 4 2 4 3 2 4 3 3 5 4 5 2 5
## [2480] 2 6 1 4 2 1 4 5 4 5 3 4 4 5 3 1 2 1 5 4 3 3 6 5 5 2 4 2 5 3 5 5 1 3 4 2 2
## [2517] 3 2 3 5 3 2 1 4 4 5 2 4 3 3 2 5 2 3 4 2 1 4 2 2 5 5 1 1 2 6 2 6 4 4 1 2 3
## [2554] 2 1 4 6 3 1 3 5 2 4 4 3 3 4 3 4 4 1 1 6 6 4 5 3 3 5 1 6 4 1 2 3 1 4 3 2 2
## [2591] 4 3 2 3 5 2 4 2 3 5 2 4 1 4 2 4 4 4 4 2 6 3 1 4 5 3 3 4 3 1 1 1 3 1 2 2 1
## [2628] 1 5 5 5 3 5 3 3 6 2 4 4 5 4 5 4 5 1 3 1 1 5 5 5 2 4 3 5 2 1 6 5 5 5 5 6 3
## [2665] 2 4 3 3 4 4 2 2 2 4 3 2 5 6 5 4 2 1 3 3 6 4 4 4 1 2 4 2 3 2 4 2 4 4 6 4 3
## [2702] 5 4 2 3 1 4 3 4 4 4 4 2 2 4 3 1 3 2 2 2 2 4 1 1 4 1 3 1 2 3 6 3 3 1 2 4 5
## [2739] 2 3 1 5 3 4 4 5 3 1 1 3 3 2 1 5 4 5 3 2 4 1 1 4 3 1 1 5 3 5 5 6 3 3 6 3 2
## [2776] 5 2 6 3 1 4 5 5 5 4 1 5 2 3 4 3 2 5 5 4 3 6 6 4 2 1 3 5 2 3 2 1 4 5 4 1 1
## [2813] 5 2 4 1 6 4 2 5 3 4 5 5 3 4 6 4 2 1 3 1 1 2 5 2 4 2 6 2 3 2 4 4 3 2 2 4 1
## [2850] 2 4 4 1 2 3 3 3 2 3 6 4 2 3 2 1 2 1 5 6 3 1 3 4 5 5 3 1 3 2 3 2 2 4 4 4 3
## [2887] 1 3 5 2 6 1 5 4 4 1 5 4 3 4 5 4 3 3 1 4 1 1 1 1 2 4 6 5 4 4 5 5 4 3 5 2 5
## [2924] 4 1 1 6 3 2 2 5 4 1 2 5 1 2 2 5 1 2 4 4 2 4 6 2 5 5 1 6 2 3 5 2 3 4 5 3 5
## [2961] 2 4 2 5 3 4 3 4 4 3 2 3 3 2 4 3 4 4 3 3 4 1 1 4 6 3 2 3 3 5 3 1 4 3 4 1 3
## [2998] 1 4 1 5 2 5 2 4 4 2 2 2 2 2 3 3 1 4 2 5 1 4 5 2 5 2 1 2 2 4 4 4 2 3 3 2 2
## [3035] 5 5 1 5 2 2 4 4 5 4 4 5 2 3 4 4 2 4 4 5 1 4 2 4 2 2 3 2 5 1 2 2 3 3 1 3 3
## [3072] 4 5 1 2 1 3 2 2 5 5 4 3 4 2 5 2 2 1 4 4 5 2 2 1 3 6 4 2 1 3 2 5 5 4 4 1 5
## [3109] 1 3 2 4 2 4 3 2 3 5 2 1 2 4 4 1 3 2 5 5 5 5 3 5 5 2 2 4 2 2 3 2 2 5 3 2 2
## [3146] 2 4 4 5 5 3 2 1 2 5 4 5 2 3 5 3 1 4 2 4 3 4 5 3 4 3 5 3 1 1 5 2 2 4 3 1 4
## [3183] 6 4 5 5 2 5 2 3 2 4 4 6 4 3 3 4 5 2 6 3 4 3 4 3 2 2 1 3 1 5 1 4 4 4 3 3 6
## [3220] 4 6 4 4 1 2 5 4 1 4 2 5 4 3 1 4 1 4 4 3 2 3 4 2 5 5 2 4 4 4 4 3 3 3 6 5 2
## [3257] 1 1 5 5 2 3 5 3 1 4 5 1 4 3 1 4 4 2 3 4 3 2 2 4 1 1 4 2 1 2 3 4 5 2 4 1 2
## [3294] 4 1 2 3 1 1 1 5 4 5 2 5 4 4 1 3 4 1 1 2 3 2 5 5 2 5 3 1 5 2 4 1 4 4 3 4 2
## [3331] 2 2 5 1 2 2 2 2 5 1 2 2 5 4 4 2 4 4 4 2 4 2 3 5 2 3 2 1 4 3 1 2 1 3 4 2 1
## [3368] 3 4 4 3 4 2 4 4 5 2 3 3 5 4 4 2 4 3 5 5 4 3 3 4 2 3 3 2 4 4 4 1 4 1 1 3 3
## [3405] 1 5 2 6 1 2 5 2 3 5 5 5 6 1 2 3 2 5 2 6 1 2 4 4 4 6 4 2 2 5 3 5 3 1 5 2 4
## [3442] 3 3 3 3 2 2 3 4 4 2 4 2 2 2 5 4 1 2 2 3 3 1 3 2 5 3 3 4 2 1 1 3 1 4 4 1 3
## [3479] 5 3 3 4 5 4 1 3 3 5 4 4 1 4 4 3 3 2 2 2 2 6 1 4 1 4 4 6 4 4 5 3 3 4 3 6 5
## [3516] 1 4 4 5 2 4 3 5 5 2 1 5 5 1 2 1 5 4 6 1 3 1 4 1 5 3 1 6 2 1 2 2 2 2 3 3 4
## [3553] 4 2 4 5 2 3 5 2 6 5 3 5 1 4 2 6 2 5 6 1 2 4 6 5 3 3 2 4 1 2 2 4 5 4 4 5 4
## [3590] 2 4 3 2 1 1 3 4 1 1 4 2 2 5 2 4 2 4 5 1 4 3 5 2 1 3 5 1 2 3 3 5 1 2 5 1 3
## [3627] 1 1 4 2 1 5 2 1 4 2 4 4 4 3 5 4 3 2 1 3 2 3 1 6 5 5 3 5 6 4 4 6 1 4 4 3 5
## [3664] 2 2 5 2 5 4 4 5 4 1 5 2 3 4 2 4 5 3 2 1 2 5 3 1 2 3 2 1 4 4 5 4 2 3 4 3 1
## [3701] 1 2 4 3 4 2 2 2 2 6 3 2 4 5 1 5 6 3 1 4 6 2 4 3 4 5 1 3 3 3 2 2 4 2 4 2 4
## [3738] 3 3 4 2 4 1 2 1 3 3 4 2 1 5 4 5 4 5 2 5 4 4 4 2 1 5 5 1 3 4 2 4 3 4 5 3 2
## [3775] 6 3 1 2 2 3 2 1 2 6 4 5 2 5 2 1 2 5 3 2 4 1 2 1 1 4 2 4 4 2 5 3 4 5 6 2 3
## [3812] 3 3 5 5 3 3 4 4 5 2 5 5 3 2 2 5 3 5 5 2 5 1 3 1 6 3 5 2 1 2 3 3 4 4 1 4 2
## [3849] 2 5 4 5 3 1 4 5 3 1 1 4 4 4 4 3 3 4 1 2 5 4 2 1 2 4 4 4 4 3 5 5 3 1 2 3 2
## [3886] 3 4 2 4 1 2 3 4 2 2 5 2 1 5 4 1 2 6 1 2 4 4 3 5 6 6 2 4 1 6 3 3 4 2 6 5 3
## [3923] 4 4 2 4 3 4 5 6 2 4 2 4 4 4 5 1 5 5 1 4 1 5 4 1 6 3 1 4 4 5 5 5 2 2 5 5 4
## [3960] 5 4 3 4 6 3 4 3 4 4 3 1 3 3 4 2 2 4 6 5 4 5 5 1 4 5 6 4 4 3 2 3 5 3 1 2 4
## [3997] 4 1 2 2 1 5 4 3 5 2 2 2 1 2 5 4 2 3 1 3 4 4 4 4 2 3 6 4 4 4 2 2 6 5 2 4 3
## [4034] 1 2 2 2 3 2 4 4 3 4 5 5 6 3 1 4 2 5 5 2 6 4 5 1 4 5 4 3 4 5 3 1 4 5 6 1 5
## [4071] 2 5 1 5 4 4 2 4 5 3 5 1 5 2 5 4 2 2 1 2 5 5 2 2 3 5 1 2 1 3 3 3 3 4 4 5 6
## [4108] 5 1 2 2 6 1 3 3 5 3 6 5 6 5 2 5 2 5 6 3 3 4 1 5 2 5 5 4 5 3 4 3 1 1 5 1 5
## [4145] 2 4 4 1 2 5 4 1 1 2 2 5 2 1 5 1 5 4 3 3 5 2 2 6 5 5 5 5 2 2 4 2 5 2 4 5 5
## [4182] 2 5 2 3 3 3 5 2 4 2 3 4 2 5 5 2 4 1 2 2 6 1 1 4 2 2 5 4 1 5 6 5 4 4 1 4 2
## [4219] 2 3 2 1 5 6 2 6 6 6 1 4 4 5 2 4 3 2 1 5 4 2 2 5 5 4 3 4 5 5 1 4 2 4 2 3 5
## [4256] 3 1 3 3 5 4 5 1 1 6 4 5 2 2 2 1 2 2 4 2 1 2 6 2 5 3 1 2 3 3 4 6 2 1 3 1 6
## [4293] 4 3 1 2 5 6 3 5 5 3 5 2 1 4 5 1 5 4 2 2 4 2 1 5 2 2 2 1 4 5 3 1 4 5 4 4 5
## [4330] 3 4 6 4 2 5 4 3 3 1 2 4 6 4 2 1 2 1 6 3 2 5 4 6 4 2 5 3 3 5 5 2 4 2 2 2 2
## [4367] 2 5 5 2 4 6
## 
## $centers
##   mon_bins rec_bins freq_bins
## 1 2.456845 4.694940  2.514881
## 2 1.546137 2.133047  1.566524
## 3 2.753459 1.942138  2.977358
## 4 4.532882 1.663443  4.966151
## 5 1.179666 4.837047  1.178273
## 6 4.687783 2.022624  2.579186
## 
## $totss
## [1] 28750.38
## 
## $withinss
## [1] 1063.0625 1167.3938  988.6088 1482.0754  308.9387  350.2081
## 
## $tot.withinss
## [1] 5360.287
## 
## $betweenss
## [1] 23390.09
head(df_mod6)
##   mon_bins rec_bins freq_bins cluster
## 1        1        5         1       5
## 2        5        1         5       4
## 3        4        3         2       6
## 4        4        2         3       6
## 5        2        5         1       5
## 6        3        2         4       3
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
head(df_mod6)
##   mon_bins rec_bins freq_bins cluster
## 1        1        5         1       5
## 2        5        1         5       4
## 3        4        3         2       6
## 4        4        2         3       6
## 5        2        5         1       5
## 6        3        2         4       3
nrow(df_mod6)
## [1] 4372
plot_ly(x=df_mod6$rec_bins, y=df_mod6$freq_bins, z=df_mod6$mon_bins, type="scatter3d", mode="markers", color=df_mod6$cluster)
head(df_mod6,15)
##    mon_bins rec_bins freq_bins cluster
## 1         1        5         1       5
## 2         5        1         5       4
## 3         4        3         2       6
## 4         4        2         3       6
## 5         2        5         1       5
## 6         3        2         4       3
## 7         1        5         1       5
## 8         3        5         3       1
## 9         2        5         1       5
## 10        5        2         3       6
## 11        5        2         5       4
## 12        3        1         2       3
## 13        5        1         5       4
## 14        5        3         5       4
## 15        1        5         1       5
df_cluster1=df_mod6[df_mod6==1,]
nrow(df_cluster1)
## [1] 3971
df_cluster2=df_mod6[df_mod6==2,]
nrow(df_cluster2)
## [1] 4254
df_cluster3=df_mod6[df_mod6==3,]
nrow(df_cluster3)
## [1] 3559
df_cluster4=df_mod6[df_mod6==4,]
nrow(df_cluster4)
## [1] 1678
df_cluster5=df_mod6[df_mod6==5,]
nrow(df_cluster5)
## [1] 3805
df_cluster6=df_mod6[df_mod6==6,]
nrow(df_cluster6)
## [1] 221

#hierarchical clustering #scaling the variables in hierarchical clustering

df_scale=scale(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],center=TRUE,scale=TRUE)


summary(df_scale)
##     rec_bins         freq_bins          mon_bins      
##  Min.   :-1.1989   Min.   :-1.1800   Min.   :-1.1951  
##  1st Qu.:-1.1989   1st Qu.:-1.1800   1st Qu.:-0.6712  
##  Median :-0.5400   Median :-0.5097   Median :-0.1472  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7778   3rd Qu.: 0.8310   3rd Qu.: 0.3768  
##  Max.   : 1.4367   Max.   : 1.5014   Max.   : 1.5994
dist_mat <- dist(df_scale, method = 'euclidean')

#plotting dendrogram

hclust_avg <- hclust(dist_mat, method = 'ward.D2')
plot(hclust_avg)

#creating the required number of clusters

member = cutree(hclust_avg,6)

h_clust=rfm_bin
h_clust$cluster=member

#plotting the clusters formed using hierarchical clustering

library(cluster)
res.pca <- prcomp(h_clust[,c("mon_bins","rec_bins","freq_bins")],  scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(h_clust$cluster)
# Data inspection



ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2", 
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
  size = 1.5,  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)

#shiloutte analysis

df6_scale=data.frame(scale(rfm_bin[,c("mon_bins","rec_bins","freq_bins")],center=TRUE,scale=TRUE))
k_mod6_1=kmeans(df6_scale,centers=6,nstart=25)
k_mod6_1
## K-means clustering with 6 clusters of sizes 829, 872, 1091, 688, 561, 331
## 
## Cluster means:
##      mon_bins   rec_bins   freq_bins
## 1 -0.87489870 -0.3714783 -0.81533013
## 2  1.54976773 -0.8512784  1.36225682
## 3 -0.88392573  1.3279575 -0.83286360
## 4  0.12801777 -0.8655820 -0.11505324
## 5  0.39766913  0.6990949  0.03761136
## 6  0.08183663 -0.5897375  1.37381141
## 
## Clustering vector:
##    [1] 3 2 5 4 3 6 3 5 3 2 2 4 2 2 3 2 3 4 3 1 2 4 5 3 4 4 5 2 5 2 4 5 1 3 2 1 6
##   [38] 5 5 2 2 4 5 3 3 1 5 2 4 2 2 5 5 5 3 2 2 5 1 4 5 2 5 5 3 4 2 2 1 2 2 2 5 5
##   [75] 3 2 2 3 1 2 1 5 3 1 2 3 2 4 5 1 5 2 5 5 5 4 4 4 4 5 2 2 2 2 5 2 2 4 4 2 2
##  [112] 2 2 4 3 2 1 1 3 4 1 1 2 5 2 3 1 3 3 5 4 3 5 1 1 5 3 5 2 2 1 2 3 1 2 2 4 1
##  [149] 2 5 1 1 5 5 2 1 2 2 4 1 5 4 3 3 5 3 1 2 6 4 1 3 2 3 2 3 3 1 2 2 1 4 3 3 4
##  [186] 6 2 1 3 1 3 2 2 5 1 1 1 1 5 3 3 3 2 1 2 2 4 2 3 1 4 1 3 6 5 2 5 1 2 3 2 5
##  [223] 1 1 2 4 2 3 3 3 5 2 2 4 4 1 2 5 2 4 3 4 3 4 5 3 4 4 2 3 3 4 3 5 5 4 2 4 4
##  [260] 1 1 2 4 3 3 1 2 5 2 3 4 2 4 2 5 4 2 2 2 2 2 3 2 4 3 4 5 5 4 4 2 5 1 1 2 2
##  [297] 3 2 2 5 2 4 2 3 4 5 1 4 2 2 5 4 4 3 2 2 5 3 2 3 3 3 5 3 3 5 1 6 2 2 2 2 3
##  [334] 5 2 5 5 1 2 5 6 2 3 5 2 5 3 5 4 4 2 4 2 4 4 1 4 1 3 3 3 3 5 1 3 4 3 1 3 3
##  [371] 1 3 5 3 3 3 4 3 5 5 1 6 4 4 3 4 3 1 3 3 2 3 6 2 2 2 5 6 4 3 4 4 3 3 5 6 3
##  [408] 2 5 1 3 6 3 2 5 3 1 3 3 3 5 6 4 1 3 4 4 1 5 5 3 4 3 1 1 3 2 3 4 2 1 2 2 4
##  [445] 2 3 2 3 4 1 2 3 1 5 4 2 3 1 2 6 2 4 6 1 5 6 5 3 1 3 6 2 2 4 4 4 1 2 3 2 1
##  [482] 2 6 1 4 3 6 2 3 5 3 5 1 3 1 4 1 3 2 1 1 1 4 1 4 3 3 2 3 3 2 3 1 3 3 6 2 2
##  [519] 2 5 1 2 2 5 4 5 4 4 6 4 3 1 5 5 5 4 3 3 3 1 2 2 1 2 3 5 1 3 3 1 3 4 3 4 1
##  [556] 1 2 3 3 5 6 2 1 3 2 5 4 3 2 2 1 1 5 4 1 2 2 3 3 2 1 4 3 4 3 4 6 2 2 3 4 4
##  [593] 3 3 5 2 2 1 3 4 3 3 6 5 2 2 4 2 5 2 1 3 1 1 4 2 4 5 1 3 4 4 5 6 1 3 5 4 3
##  [630] 4 6 1 1 4 4 2 4 1 2 1 2 2 3 4 3 1 1 5 1 3 2 2 2 4 4 1 6 2 3 4 6 5 3 3 3 5
##  [667] 5 3 2 4 3 3 1 3 2 6 5 3 4 4 3 5 3 3 3 1 2 1 3 5 5 1 1 4 1 1 3 3 2 5 2 2 2
##  [704] 2 3 6 1 3 1 1 4 1 5 4 1 3 2 3 1 5 3 3 2 1 3 3 5 3 3 2 3 4 6 4 4 4 4 2 2 4
##  [741] 2 6 1 1 4 2 1 2 3 4 5 4 2 5 3 5 2 5 3 2 3 1 1 1 4 1 4 5 5 5 3 3 1 2 4 1 2
##  [778] 1 3 3 5 1 2 5 5 6 2 3 2 3 3 4 1 3 2 3 4 5 3 3 1 1 1 2 4 3 5 4 4 2 1 3 2 2
##  [815] 5 2 1 1 4 4 5 1 2 1 2 3 1 4 2 1 2 2 3 3 4 4 2 1 4 3 5 3 6 2 3 2 1 3 5 5 4
##  [852] 3 2 3 1 3 3 5 1 2 3 5 5 3 3 2 4 3 4 4 5 1 1 1 2 3 2 3 4 2 3 3 1 5 4 2 5 1
##  [889] 2 6 1 2 4 1 2 1 3 1 3 2 2 4 1 1 6 6 4 3 2 2 1 6 1 3 5 5 3 1 6 3 4 3 2 5 1
##  [926] 4 2 4 4 3 5 3 1 2 4 2 5 1 2 1 6 1 2 3 6 1 1 1 6 6 5 6 3 4 3 3 4 3 5 2 2 5
##  [963] 2 6 5 4 6 1 5 3 1 2 3 4 3 2 1 4 2 5 3 1 2 5 5 3 1 3 2 1 1 3 4 3 3 2 4 3 3
## [1000] 4 3 1 1 4 3 2 2 1 3 2 3 2 4 3 5 3 2 1 3 3 5 3 6 3 4 5 1 5 3 2 4 1 4 5 3 1
## [1037] 2 6 3 1 3 5 2 4 5 3 3 5 5 3 5 5 3 2 2 2 4 1 4 3 5 6 2 2 2 6 1 2 3 4 3 6 4
## [1074] 3 1 4 4 1 2 5 3 1 5 3 2 1 1 3 4 3 4 3 1 4 5 2 1 1 5 3 6 4 4 3 5 1 3 2 1 3
## [1111] 4 3 6 3 2 4 1 4 5 6 6 5 4 2 3 1 3 4 2 3 1 1 4 4 2 2 2 3 3 6 1 1 5 2 2 4 2
## [1148] 4 3 1 3 3 3 2 1 5 1 2 1 3 4 5 2 1 2 2 4 1 4 1 3 1 2 4 1 1 4 3 1 5 3 3 5 3
## [1185] 3 1 4 4 2 1 1 3 2 4 5 5 4 4 3 6 3 5 1 1 5 3 2 2 1 3 1 2 3 1 5 1 2 2 1 2 1
## [1222] 2 5 3 4 1 2 2 4 2 3 4 5 2 2 5 5 3 3 1 5 5 4 4 3 3 4 3 1 1 2 2 2 4 4 3 3 5
## [1259] 1 2 3 6 5 3 4 3 2 4 2 1 3 1 5 2 2 3 2 2 4 3 5 3 6 1 4 3 1 5 3 1 5 3 6 2 1
## [1296] 2 1 3 2 6 2 1 2 3 5 4 3 5 2 1 1 2 4 2 4 3 4 3 3 2 1 2 1 5 5 4 3 2 3 2 4 1
## [1333] 4 4 3 3 4 2 1 3 3 4 5 5 3 2 4 1 2 5 3 5 5 3 4 5 4 1 2 3 5 2 6 2 1 3 2 2 2
## [1370] 5 2 3 3 4 3 5 1 1 3 3 2 2 2 1 3 4 2 4 2 1 1 3 2 5 5 6 2 2 5 5 2 4 6 2 1 3
## [1407] 3 2 4 3 3 2 5 5 3 3 2 2 5 2 2 3 4 2 3 3 5 3 3 1 1 5 2 3 2 1 4 2 1 3 4 2 2
## [1444] 2 4 1 4 2 2 1 3 3 6 5 2 2 3 4 3 2 1 3 6 3 3 3 2 3 4 1 2 4 4 3 3 3 2 1 5 1
## [1481] 5 3 1 3 3 5 3 1 3 5 1 4 2 1 5 2 3 3 3 3 3 4 1 6 4 1 3 2 2 4 2 3 2 5 2 3 2
## [1518] 5 6 3 5 2 5 6 3 4 3 6 2 1 2 6 1 5 2 3 1 2 1 6 3 2 4 3 3 3 5 2 4 4 6 6 4 4
## [1555] 6 3 6 1 3 2 3 5 3 5 2 3 5 2 4 4 2 1 1 5 3 4 3 1 3 2 3 1 1 6 5 3 3 2 1 1 3
## [1592] 3 2 6 1 1 2 2 6 2 2 6 1 4 3 3 5 2 1 1 1 4 3 2 2 2 1 6 2 5 4 2 2 3 1 1 4 4
## [1629] 6 3 2 2 2 2 3 3 4 1 3 6 2 1 1 2 5 2 2 5 4 1 3 2 5 3 6 1 1 1 6 5 1 3 6 3 6
## [1666] 6 6 4 2 4 1 3 1 3 2 2 4 1 3 1 3 3 2 3 6 1 5 2 3 5 6 4 3 3 1 1 2 2 1 1 5 4
## [1703] 3 2 3 6 6 1 2 2 3 6 1 2 1 1 6 6 6 1 2 5 5 5 4 6 2 3 2 1 3 6 3 2 3 4 2 5 3
## [1740] 2 3 6 3 2 2 2 4 6 3 4 2 6 2 4 2 1 4 4 6 4 3 6 4 3 3 2 4 2 2 2 4 4 2 5 4 4
## [1777] 5 3 4 1 1 1 2 4 1 5 2 3 1 3 1 1 2 1 2 5 3 5 2 1 5 3 1 3 1 1 4 1 4 1 1 4 3
## [1814] 2 1 1 2 1 5 1 4 1 2 2 3 4 3 5 5 1 1 3 3 4 2 5 4 1 4 1 6 3 2 2 2 3 2 3 1 2
## [1851] 1 2 3 4 5 4 1 1 4 1 1 2 5 2 4 3 6 3 4 6 5 1 1 4 5 3 5 3 3 3 3 4 5 2 3 4 6
## [1888] 6 1 2 4 3 2 1 4 2 4 4 1 5 5 5 4 4 3 1 2 4 3 5 5 2 5 2 1 1 4 4 1 2 1 4 3 3
## [1925] 5 1 2 3 6 3 5 1 1 1 5 6 3 5 1 1 5 2 3 1 3 1 3 1 1 3 4 5 5 2 6 3 6 1 6 1 3
## [1962] 1 1 3 2 6 2 2 4 5 4 4 1 6 5 2 2 1 6 6 2 3 3 1 2 3 2 3 6 3 4 6 3 4 6 5 2 6
## [1999] 6 2 5 3 2 4 5 6 5 3 5 4 5 5 5 2 5 1 3 3 4 2 3 1 3 2 5 1 1 3 1 3 1 4 5 2 6
## [2036] 3 5 3 4 4 2 2 1 4 4 3 3 3 1 5 2 1 2 1 6 2 3 4 5 4 3 4 3 5 1 3 2 4 5 3 4 3
## [2073] 2 2 1 1 4 2 1 2 3 1 6 3 3 1 3 2 3 1 3 2 3 5 3 4 1 4 2 2 5 1 2 4 4 3 1 6 2
## [2110] 3 5 1 4 3 5 3 3 2 4 1 2 3 6 3 3 3 3 3 3 4 2 3 3 3 5 3 6 4 4 4 6 1 1 5 3 3
## [2147] 2 2 4 4 3 5 3 3 3 5 3 3 3 1 5 3 4 1 2 2 5 4 1 1 1 3 3 2 1 1 2 4 2 2 3 2 4
## [2184] 2 5 1 2 3 5 5 3 3 2 5 3 3 1 3 4 3 1 2 5 3 3 1 4 4 5 3 5 1 5 5 3 1 4 1 3 3
## [2221] 1 3 3 2 5 1 2 2 1 3 3 5 6 5 5 1 3 2 5 2 3 6 5 4 3 2 3 1 3 3 3 4 1 2 3 3 3
## [2258] 4 4 5 3 3 4 6 3 2 6 3 4 3 2 1 1 5 3 1 3 6 6 1 4 1 5 4 1 6 3 3 5 3 2 6 3 3
## [2295] 3 4 1 4 6 3 3 3 1 3 3 1 5 2 5 4 1 1 4 5 4 1 4 5 2 3 2 2 3 4 2 4 2 6 4 2 4
## [2332] 2 3 1 2 6 3 5 3 6 3 2 2 1 3 2 4 2 4 1 1 6 1 2 3 2 6 2 6 4 3 3 4 1 3 3 2 1
## [2369] 2 6 4 1 3 5 3 2 1 1 4 3 6 1 1 4 5 2 4 2 6 5 4 3 3 5 2 4 6 3 4 3 3 1 1 2 5
## [2406] 1 1 4 2 4 1 1 4 5 3 4 3 1 4 1 2 4 1 6 4 2 3 3 5 5 2 4 2 1 5 1 3 3 2 5 3 5
## [2443] 4 3 3 3 1 1 5 4 3 4 1 5 1 6 3 3 1 5 1 3 4 3 2 2 6 1 2 4 1 6 4 4 3 2 3 1 3
## [2480] 1 2 3 2 1 5 6 3 2 3 5 6 2 3 4 3 4 5 3 2 6 4 2 3 3 4 2 1 3 5 3 3 5 4 2 1 1
## [2517] 4 1 4 3 4 1 5 2 2 3 1 2 6 5 1 3 1 4 2 1 3 2 1 1 3 3 5 3 1 2 1 2 2 6 3 1 4
## [2554] 4 3 2 5 6 6 4 3 1 2 2 4 4 5 4 6 2 5 5 4 5 2 3 4 4 3 3 5 6 5 1 5 5 2 4 1 1
## [2591] 2 4 1 5 3 1 2 1 6 3 1 2 5 2 1 6 6 2 6 1 4 4 3 2 3 4 6 2 6 5 3 3 4 3 1 1 3
## [2628] 3 3 3 3 4 3 4 6 2 1 6 6 3 6 3 6 3 3 6 3 3 3 3 3 1 2 4 3 1 3 5 3 3 3 3 5 4
## [2665] 4 2 4 4 2 2 1 1 1 6 4 4 3 4 3 2 1 5 4 5 5 2 6 2 5 1 6 1 4 1 2 1 6 2 4 2 6
## [2702] 3 2 1 4 3 2 6 2 6 6 6 1 1 2 4 3 6 1 1 1 1 2 3 5 2 3 5 5 4 5 4 4 4 5 1 6 3
## [2739] 1 4 5 3 4 2 6 3 4 3 3 6 4 1 3 3 2 3 4 1 2 5 3 6 5 3 5 3 4 3 3 5 4 4 4 4 1
## [2776] 3 1 4 5 3 6 3 3 3 2 3 3 1 5 6 5 1 3 3 2 4 4 5 2 1 3 5 3 1 5 1 3 2 3 2 5 3
## [2813] 3 1 2 5 4 2 4 3 5 2 3 3 4 2 4 2 1 5 4 5 5 1 3 1 2 1 5 1 5 4 2 2 4 1 1 2 3
## [2850] 1 6 2 3 1 4 6 5 1 4 5 2 1 4 1 5 1 5 3 2 6 5 6 6 3 3 5 3 4 1 4 1 1 2 2 2 4
## [2887] 5 4 3 1 5 3 3 2 2 3 3 2 4 6 3 2 5 5 5 6 3 3 3 3 1 6 5 3 6 6 3 3 2 4 3 1 3
## [2924] 2 3 3 4 5 1 1 3 2 5 1 3 5 1 4 3 6 1 2 2 1 6 2 1 3 3 6 2 1 4 3 1 4 6 3 4 3
## [2961] 1 2 1 3 4 2 4 2 6 4 1 4 4 1 6 4 6 6 6 6 6 3 3 2 5 6 1 5 4 3 5 3 2 6 6 5 4
## [2998] 3 2 3 3 1 3 1 2 2 1 1 1 1 1 4 6 5 2 1 3 3 2 3 1 3 1 3 1 1 6 6 2 1 6 4 1 1
## [3035] 3 3 5 3 1 1 5 2 3 2 6 3 1 4 6 6 1 6 6 3 3 2 1 2 1 4 5 1 3 3 1 1 4 5 5 6 6
## [3072] 2 3 3 1 5 4 4 1 3 3 6 4 6 1 3 1 1 5 2 2 3 1 1 3 4 4 6 4 5 4 1 3 3 2 2 5 3
## [3109] 5 4 1 2 1 2 4 4 5 3 4 5 1 2 6 5 5 1 3 3 3 3 5 3 3 1 1 2 1 1 5 1 1 3 4 1 1
## [3146] 1 2 2 3 3 4 1 3 4 3 2 3 1 5 3 5 3 2 1 2 5 2 3 4 2 4 3 4 3 5 3 1 1 2 4 3 2
## [3183] 2 6 3 3 4 3 1 5 4 6 2 4 2 6 4 2 3 1 5 4 2 5 2 4 1 1 3 5 5 3 5 2 2 2 6 4 5
## [3220] 2 4 2 2 5 1 3 2 3 2 1 3 2 5 5 6 3 6 2 5 1 5 2 1 3 3 1 2 6 2 2 4 4 6 4 3 1
## [3257] 5 6 3 3 1 4 3 4 3 2 3 5 2 4 3 6 2 1 5 2 5 1 1 2 5 3 2 1 6 1 6 2 3 1 6 3 1
## [3294] 6 5 1 4 5 6 5 3 6 3 4 3 2 2 3 6 2 3 3 4 4 1 3 3 1 3 4 3 3 1 2 3 2 2 5 2 1
## [3331] 1 1 3 3 4 1 1 1 3 5 1 1 3 6 2 1 2 2 6 1 6 1 5 3 1 6 1 5 6 6 3 1 3 6 6 1 5
## [3368] 4 2 2 5 6 1 6 2 3 1 4 6 3 6 5 1 2 4 3 3 2 4 4 2 1 4 5 1 2 2 2 5 2 3 3 4 5
## [3405] 5 3 1 2 3 1 3 1 6 3 3 3 4 3 4 4 1 3 1 4 5 1 2 2 2 4 2 1 4 3 6 3 4 3 3 1 2
## [3442] 5 4 4 4 1 1 4 2 2 4 6 1 1 1 3 6 5 1 4 5 4 3 5 1 3 4 6 2 4 3 3 5 5 2 2 3 4
## [3479] 3 4 5 2 3 2 3 4 5 3 2 2 5 2 6 4 4 1 1 1 1 2 5 2 6 2 2 2 2 2 3 4 4 6 4 4 3
## [3516] 5 2 6 3 1 6 4 3 3 4 3 3 3 3 1 5 3 2 4 3 4 3 2 3 3 4 5 4 1 3 1 1 1 1 4 6 2
## [3553] 2 1 2 3 1 4 3 1 2 3 4 3 5 6 1 2 1 3 2 5 1 2 4 3 4 6 1 2 5 1 1 2 3 2 6 3 2
## [3590] 1 2 4 1 3 3 4 2 5 5 2 4 1 3 1 6 4 2 3 5 2 5 3 1 5 4 3 5 1 6 4 3 5 1 3 6 4
## [3627] 3 5 2 1 5 3 1 5 6 1 6 6 6 6 3 2 4 1 3 4 1 4 3 2 3 3 5 3 4 2 2 4 5 6 6 4 3
## [3664] 4 1 3 1 3 5 2 3 2 5 3 1 4 2 1 6 3 4 1 5 4 3 4 3 1 6 1 3 2 2 3 6 1 4 6 4 3
## [3701] 3 1 2 4 2 1 1 1 1 4 4 1 2 3 3 3 4 4 5 2 2 1 2 6 5 3 3 4 4 4 4 1 2 1 2 1 2
## [3738] 4 4 2 1 2 3 1 3 4 5 2 1 3 3 2 3 5 3 1 3 2 2 2 1 3 3 3 5 4 6 1 6 5 2 3 4 1
## [3775] 4 4 5 1 1 4 1 3 1 2 2 3 4 3 1 3 4 3 4 4 5 5 1 3 3 2 1 2 2 1 3 4 6 3 4 1 4
## [3812] 4 4 3 3 4 5 2 2 3 1 3 3 4 1 4 3 6 3 3 1 3 3 4 5 2 6 3 4 5 1 4 4 6 2 3 6 1
## [3849] 1 3 2 3 4 3 2 3 4 6 5 2 6 2 2 5 4 2 5 1 3 2 1 3 1 6 2 2 2 6 3 3 4 3 1 4 1
## [3886] 4 2 1 2 3 4 4 2 1 1 3 1 5 3 2 5 1 5 5 1 2 2 4 3 2 5 4 2 3 4 4 4 2 1 2 3 4
## [3923] 2 2 1 2 4 2 3 5 1 2 1 2 2 2 3 5 3 3 5 2 3 3 2 3 4 4 5 2 2 3 3 3 4 1 3 3 2
## [3960] 3 2 4 2 5 5 2 4 2 2 5 3 4 5 2 4 1 2 2 3 2 3 3 5 2 3 4 2 2 6 1 4 3 4 3 1 2
## [3997] 2 5 1 1 6 3 6 4 3 1 1 1 5 1 3 2 1 4 3 4 6 2 6 6 1 4 2 2 6 2 1 4 5 3 1 2 4
## [4034] 5 4 1 1 4 1 6 2 4 2 3 3 5 5 3 5 1 3 3 4 2 2 3 3 2 3 2 6 2 3 4 3 2 3 5 5 3
## [4071] 1 3 3 3 6 6 1 6 3 4 3 5 3 1 3 6 1 1 5 1 3 3 1 1 5 3 3 4 3 5 4 6 5 2 6 3 4
## [4108] 3 3 1 4 4 5 4 5 3 4 2 3 4 3 1 3 1 3 2 5 4 2 3 3 1 3 3 6 3 6 2 4 3 3 3 3 3
## [4145] 1 6 2 3 1 3 6 5 3 1 1 3 1 3 3 5 3 6 4 4 3 1 1 5 3 3 3 3 1 1 6 1 3 1 6 3 3
## [4182] 1 3 1 6 4 6 3 1 2 1 6 2 1 3 3 1 2 3 1 4 2 5 5 2 1 1 3 2 3 3 5 3 2 2 3 2 4
## [4219] 1 4 1 5 3 5 1 2 5 5 3 6 2 3 1 2 4 1 3 3 2 1 4 3 3 6 4 2 3 3 5 2 1 6 1 4 3
## [4256] 4 3 4 5 3 2 3 5 5 2 2 3 1 1 1 5 4 1 6 4 3 1 4 1 3 4 5 1 4 5 2 5 1 3 4 5 4
## [4293] 2 4 3 1 3 2 5 3 3 5 3 1 6 2 3 5 3 2 1 1 2 1 3 3 1 1 1 3 2 3 4 3 2 3 2 2 3
## [4330] 4 2 2 5 1 3 6 4 4 5 1 2 5 2 1 5 1 3 5 6 1 3 2 4 5 1 3 4 5 3 3 1 2 1 1 1 1
## [4367] 1 3 3 1 2 4
## 
## Within cluster sum of squares by cluster:
## [1] 425.9603 365.4587 416.1030 344.4632 577.9936 163.7479
##  (between_SS / total_SS =  82.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
head(df6_scale)
##     mon_bins   rec_bins  freq_bins
## 1 -1.1951472  1.4366635 -1.1800234
## 2  1.5994422 -1.1988522  1.5014019
## 3  0.9007949  0.1189056 -0.5096671
## 4  0.9007949 -0.5399733  0.1606893
## 5 -0.4964998  1.4366635 -1.1800234
## 6  0.2021475 -0.5399733  0.8310456
df6_scale$cluster=k_mod6_1$cluster

head(df6_scale)
##     mon_bins   rec_bins  freq_bins cluster
## 1 -1.1951472  1.4366635 -1.1800234       3
## 2  1.5994422 -1.1988522  1.5014019       2
## 3  0.9007949  0.1189056 -0.5096671       5
## 4  0.9007949 -0.5399733  0.1606893       4
## 5 -0.4964998  1.4366635 -1.1800234       3
## 6  0.2021475 -0.5399733  0.8310456       6
library(cluster)

s =silhouette(df6_scale$cluster, dist(df6_scale[,-4],method = 'euclidean'))

plot(s, col=1:3, border=NA)

the average silhoutte width is .43 ,more it is towards 1 the better

#Silhoutte analysis for hierarchical clustering

head(h_clust)
##   custid mon_bins freq_bins rec_bins tot_score cluster
## 1  12346        1         1        5  2.333333       1
## 2  12347        5         5        1  3.666667       2
## 3  12348        4         2        3  3.000000       3
## 4  12349        4         3        2  3.000000       3
## 5  12350        2         1        5  2.666667       1
## 6  12352        3         4        2  3.000000       3
s =silhouette(h_clust$cluster, dist(df6_scale,method = 'euclidean'))
plot(s, col=1:3, border=NA)

#the average silhoutte width is .33 # which is lesser thaan kmeans

#Increasing the number of clusters further

k_mod8=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],centers=8)

df_mod8=cbind(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],"cluster"=k_mod8$cluster)
df_mod8$cluster=as.factor(df_mod8$cluster)

Accuracy of the cluster with number of clusters 8=0.8483222

k_mod9=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],centers=9)

df_mod9=cbind(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],"cluster"=k_mod9$cluster)
df_mod9$cluster=as.factor(df_mod9$cluster)

Within cluster sum of squares by cluster:

[1] 340.9895 117.6919 386.8711 687.8474 926.0398 238.7811 989.9324 343.0778 109.8140

(between_SS / total_SS = 85.6 %)

k_mod10=kmeans(rfm_bin[,c("rec_bins","freq_bins","mon_bins")],centers=10)

df_mod10=cbind(rfm_bin[,c("custid","rec_bins","freq_bins","mon_bins")],"cluster"=k_mod10$cluster)
df_mod10$cluster=as.factor(df_mod10$cluster)

head(df_mod10)
##   custid rec_bins freq_bins mon_bins cluster
## 1  12346        5         1        1       4
## 2  12347        1         5        5       7
## 3  12348        3         2        4       1
## 4  12349        2         3        4       8
## 5  12350        5         1        2       4
## 6  12352        2         4        3       5

#customer segmentation by clustering into 10 clusters #recency high(new customers) ,frequency low ,monetory all types

df_cluster1=df_mod10[df_mod10$cluster==1,] #(x4-5x) high recency

head(df_cluster1,20)  
##    custid rec_bins freq_bins mon_bins cluster
## 3   12348        3         2        4       1
## 8   12354        5         3        3       1
## 23  12372        3         3        3       1
## 27  12377        5         3        4       1
## 29  12379        3         2        3       1
## 38  12393        3         3        3       1
## 39  12394        3         2        3       1
## 43  12399        4         3        3       1
## 47  12405        5         3        4       1
## 52  12410        5         2        3       1
## 53  12412        3         3        3       1
## 54  12413        3         2        3       1
## 58  12418        4         2        3       1
## 61  12422        4         2        3       1
## 63  12424        5         2        4       1
## 64  12425        3         2        3       1
## 73  12434        3         3        3       1
## 74  12435        3         2        5       1
## 82  12446        3         3        3       1
## 89  12453        4         3        3       1
str(df_cluster1)
## 'data.frame':    480 obs. of  5 variables:
##  $ custid   : num  12348 12354 12372 12377 12379 ...
##  $ rec_bins : num  3 5 3 5 3 3 3 4 5 5 ...
##  $ freq_bins: num  2 3 3 3 2 3 2 3 3 2 ...
##  $ mon_bins : num  4 3 3 4 3 3 3 3 4 3 ...
##  $ cluster  : Factor w/ 10 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
library(plyr)
# count(df_cluster1$rec_bins)
# count(df_cluster1$freq_bins)
# count(df_cluster1$mon_bins)
nrow(df_cluster1)
## [1] 480

#cluster 2 frequency high,recency high, monetory moderate to high,best customers

df_cluster2=df_mod10[df_mod10$cluster==2,] 
tail(df_cluster2,20) 
##      custid rec_bins freq_bins mon_bins cluster
## 2871  16222        5         5        3       2
## 3089  16520        5         5        3       2
## 3224  16714        5         4        3       2
## 3234  16725        5         5        3       2
## 3295  16801        5         5        3       2
## 3358  16889        5         5        3       2
## 3382  16919        5         5        5       2
## 3531  17126        5         5        3       2
## 3634  17282        4         4        4       2
## 3669  17337        5         5        5       2
## 3673  17341        5         5        3       2
## 3725  17406        5         5        5       2
## 3754  17444        5         5        5       2
## 3777  17472        5         5        3       2
## 3795  17504        5         5        5       2
## 4009  17787        5         5        4       2
## 4049  17850        5         5        5       2
## 4282  18168        5         5        3       2
## 4333  18231        5         5        5       2
## 4354  18260        5         5        5       2
nrow(df_cluster2)  
## [1] 50
# count(df_cluster2$rec_bins)
# count(df_cluster2$freq_bins)
# count(df_cluster2$mon_bins)

#cluster 3 average customers , low to average recency, frequency and spending capability

df_cluster3=df_mod10[df_mod10$cluster==3,]
nrow(df_cluster3)   
## [1] 1001
# count(df_cluster3$rec_bins)
# count(df_cluster3$freq_bins)
# count(df_cluster3$mon_bins)

View(df_cluster3) 

#cluster 4 recent buying customers,new

df_cluster4=df_mod10[df_mod10$cluster==4,]
nrow(df_cluster4)
## [1] 1091
head(df_cluster4,15) 
##    custid rec_bins freq_bins mon_bins cluster
## 1   12346        5         1        1       4
## 5   12350        5         1        2       4
## 7   12353        5         1        1       4
## 9   12355        5         1        2       4
## 15  12361        5         1        1       4
## 17  12363        4         2        2       4
## 19  12365        5         2        2       4
## 24  12373        5         1        2       4
## 34  12386        5         1        2       4
## 44  12401        5         1        1       4
## 45  12402        5         1        1       4
## 55  12414        5         2        2       4
## 65  12426        5         2        2       4
## 75  12436        4         1        2       4
## 78  12441        5         1        1       4
# count(df_cluster4$rec_bins)
# count(df_cluster4$freq_bins)
# count(df_cluster4$mon_bins)

#cluster 5 frequently buying customers loyal

df_cluster5=df_mod10[df_mod10$cluster==5,]
nrow(df_cluster5)
## [1] 256
# count(df_cluster5$rec_bins)
# count(df_cluster5$freq_bins)
# count(df_cluster5$mon_bins)


tail(df_cluster5,15) 
##      custid rec_bins freq_bins mon_bins cluster
## 4078  17886        2         5        3       5
## 4102  17917        2         4        2       5
## 4135  17961        2         5        3       5
## 4137  17964        2         5        2       5
## 4146  17974        2         5        3       5
## 4151  17979        2         5        3       5
## 4175  18016        2         5        3       5
## 4179  18022        2         5        3       5
## 4185  18034        2         5        2       5
## 4187  18036        2         4        2       5
## 4192  18043        2         5        2       5
## 4230  18096        1         5        3       5
## 4252  18125        2         5        3       5
## 4274  18156        1         5        3       5
## 4349  18252        2         4        2       5

#cluster 6 high paying customers having high frequency of buying

df_cluster6=df_mod10[df_mod10$cluster==6,]



tail(df_cluster6,15) 
##      custid rec_bins freq_bins mon_bins cluster
## 2582  15827        3         5        4       6
## 2703  16007        2         5        4       6
## 2828  16170        2         5        4       6
## 3041  16455        3         4        4       6
## 3084  16515        3         5        4       6
## 3170  16638        2         4        4       6
## 3288  16791        2         5        4       6
## 3327  16841        2         5        4       6
## 3449  17015        2         5        4       6
## 3469  17043        2         4        4       6
## 3508  17092        2         5        4       6
## 3553  17162        2         5        4       6
## 3635  17284        3         5        4       6
## 3977  17738        2         4        4       6
## 4336  18235        3         5        4       6
nrow(df_cluster6)
## [1] 35
# count(df_cluster6$rec_bins)
# count(df_cluster6$freq_bins)
# count(df_cluster6$mon_bins)

#cluster 7 slipping ….high paying customers having high frequency of buying in the past but dont buy now

df_cluster7=df_mod10[df_mod10$cluster==7,]
tail(df_cluster7,15) 
##      custid rec_bins freq_bins mon_bins cluster
## 4266  18145        1         5        5       7
## 4286  18172        1         5        5       7
## 4293  18180        1         5        4       7
## 4306  18198        1         5        5       7
## 4313  18210        1         5        5       7
## 4321  18219        1         5        5       7
## 4325  18223        1         5        5       7
## 4327  18225        1         5        5       7
## 4328  18226        2         5        5       7
## 4331  18229        1         5        5       7
## 4341  18241        1         5        5       7
## 4343  18245        1         5        5       7
## 4352  18257        2         5        5       7
## 4362  18272        1         5        5       7
## 4371  18283        1         5        5       7
nrow(df_cluster6)
## [1] 35
# count(df_cluster7$rec_bins)
# count(df_cluster7$freq_bins)
# count(df_cluster7$mon_bins)

#cluster 8 low frequency customers who rarely buy but consist of low to high spenders

df_cluster8=df_mod10[df_mod10$cluster==8,]
tail(df_cluster8,15) 
##      custid rec_bins freq_bins mon_bins cluster
## 4265  18144        1         3        5       8
## 4278  18161        1         3        4       8
## 4281  18167        1         3        3       8
## 4284  18170        2         2        3       8
## 4290  18177        2         3        3       8
## 4292  18179        1         3        4       8
## 4298  18188        1         3        5       8
## 4323  18221        2         3        3       8
## 4330  18228        2         2        3       8
## 4332  18230        1         3        5       8
## 4337  18236        2         3        3       8
## 4338  18237        1         3        3       8
## 4353  18259        2         2        5       8
## 4357  18263        2         3        3       8
## 4372  18287        2         3        4       8
nrow(df_cluster8)
## [1] 607
# count(df_cluster8$rec_bins)
# count(df_cluster8$freq_bins)
# count(df_cluster8$mon_bins)

#cluster 9 recent buyers frquent buyers, high spenders ,best customers

df_cluster9=df_mod10[df_mod10$cluster==9,]

tail(df_cluster9,15) 
##      custid rec_bins freq_bins mon_bins cluster
## 3348  16875        3         5        5       9
## 3388  16928        3         5        5       9
## 3397  16940        3         5        5       9
## 3428  16984        3         5        5       9
## 3476  17050        4         5        5       9
## 3484  17061        3         4        5       9
## 3720  17400        4         5        5       9
## 3800  17509        3         5        5       9
## 3860  17589        3         5        5       9
## 3862  17591        3         5        5       9
## 3876  17612        3         5        5       9
## 3918  17667        3         5        5       9
## 3963  17722        4         5        5       9
## 4060  17863        3         5        5       9
## 4310  18204        3         5        5       9
nrow(df_cluster9)
## [1] 91
# count(df_cluster9$rec_bins)
# count(df_cluster9$freq_bins)
# count(df_cluster9$mon_bins)

cluster 10 low spenders, frequent buyers

df_cluster10=df_mod10[df_mod10$cluster==10,]
tail(df_cluster10,15) 
##      custid rec_bins freq_bins mon_bins cluster
## 3587  17214        3         5        3      10
## 3622  17259        5         5        2      10
## 3625  17265        4         5        2      10
## 3858  17585        4         5        3      10
## 3878  17614        3         4        2      10
## 4001  17774        4         5        3      10
## 4020  17802        3         5        3      10
## 4082  17890        5         4        2      10
## 4086  17894        3         5        3      10
## 4089  17897        5         5        2      10
## 4105  17921        3         5        3      10
## 4162  17997        3         5        3      10
## 4244  18116        3         5        3      10
## 4291  18178        4         4        3      10
## 4305  18196        4         5        3      10
nrow(df_cluster10)
## [1] 72
# count(df_cluster10$rec_bins)
# count(df_cluster10$freq_bins)
# count(df_cluster10$mon_bins)

#high valued customers have RFM score of 5,5,5

df_mod10$customerID=rfm_bin$custid

df_mod10[ df_mod6$rec_bins==5 & df_mod6$freq_bins==5 & df_mod6$mon_bins==5 ,]
##      custid rec_bins freq_bins mon_bins cluster customerID
## 124   12501        5         5        5       2      12501
## 573   13093        5         5        5       2      13093
## 1196  13952        5         5        5       2      13952
## 1242  14016        5         5        5       2      14016
## 1564  14461        5         5        5       2      14461
## 2135  15235        5         5        5       2      15235
## 2243  15379        5         5        5       2      15379
## 2460  15665        5         5        5       2      15665
## 2567  15808        5         5        5       2      15808
## 3382  16919        5         5        5       2      16919
## 3669  17337        5         5        5       2      17337
## 3725  17406        5         5        5       2      17406
## 3754  17444        5         5        5       2      17444
## 3795  17504        5         5        5       2      17504
## 4049  17850        5         5        5       2      17850
## 4333  18231        5         5        5       2      18231
## 4354  18260        5         5        5       2      18260

#plot with 10 clusters

res.pca <- prcomp(df_mod10[,c("mon_bins","rec_bins","freq_bins")],  scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(df_mod10$cluster)
ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2", 
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
  size = 1.5,  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)